In [1]:
    
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, roc_curve
    
In [2]:
    
# Load the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly.
messages = pd.read_csv(
    'SMSSpamCollection.tsv',
    sep='\t',
    header=None,
    names=['label', 'text'],
)
    
In [3]:
    
# Peek at the text of the first message.
messages['text'].iloc[0]
    
    Out[3]:
In [4]:
    
# Bag-of-words features: one row per message, one column per vocabulary word.
cv = CountVectorizer()
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# selecting the Series directly already yields a 1-D array, so no ravel().
X = cv.fit_transform(messages['text'].values).todense()
# Binary target: 1 for spam, 0 for everything else.
y = (messages['label'] == 'spam').values.astype(int)
    
In [5]:
    
# Vectorize a toy message with the already-fitted vocabulary.
X_example = (
    cv
    .transform(['crazy crazy how'])
    .todense()
)
    
In [6]:
    
# Largest single-word count in the example row.
X_example[0, :].max()
    
    Out[6]:
In [7]:
    
# Hold out 30% of the messages for evaluation. A fixed random_state makes the
# split (and every metric computed from it) reproducible across kernel restarts.
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20;
# on newer versions train_test_split lives in `sklearn.model_selection`.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.3, random_state=42)
    
In [ ]:
    
    
In [8]:
    
# Class priors from the training labels: the labels are 0/1, so their mean is
# the fraction of training messages that are spam.
p_spam = np.mean(y_train)
p_ham = 1.0 - p_spam
    
In [9]:
    
# Keep only the training rows whose label is non-zero, i.e. the spam messages.
spam_messages = X_train[y_train != 0]
# Column-wise totals: how often each vocabulary word occurs across spam rows.
spam_counts = spam_messages.sum(axis=0)
# Add-one smoothed per-word likelihoods, flattened to a 1-D array.
# NOTE(review): the denominator adds 2 rather than the vocabulary size, so
# these values sum to more than 1 whenever the vocabulary has more than two
# words -- confirm this is intended (the ham estimate uses the same form).
p_words_spam = np.asarray(
    (spam_counts + 1) / (spam_counts.sum() + 2)
).ravel()
    
In [10]:
    
# Sanity check: dimensions of the per-word spam totals.
np.shape(spam_counts)
    
    Out[10]:
In [11]:
    
# Keep only the training rows whose label is zero, i.e. the ham messages.
ham_messages = X_train[y_train == 0]
# Column-wise totals: how often each vocabulary word occurs across ham rows.
ham_counts = ham_messages.sum(axis=0)
# Add-one smoothed per-word likelihoods, flattened to a 1-D array.
# NOTE(review): the denominator adds 2 rather than the vocabulary size, so
# these values sum to more than 1 whenever the vocabulary has more than two
# words -- confirm this is intended (the spam estimate uses the same form).
p_words_ham = np.asarray(
    (ham_counts + 1) / (ham_counts.sum() + 2)
).ravel()
    
In [12]:
    
def predict(msg):
    """Return the posterior probability that a vectorized message is spam.

    Uses the module-level estimates `p_words_spam`, `p_words_ham`, `p_spam`
    and `p_ham`. Only the presence of a word matters: every word with a
    non-zero count contributes its likelihood once, regardless of how many
    times it occurs.

    Parameters
    ----------
    msg : array-like
        Word counts for one message, one entry per vocabulary word.

    Returns
    -------
    float
        P(spam | msg). A message with no vocabulary words gets the prior
        `p_spam`.
    """
    present = np.ravel(msg != 0)

    # Work in log space: multiplying many small likelihoods underflows to 0
    # for long messages, which would turn the posterior into 0/0 = NaN.
    # The prior enters each joint probability exactly once.
    log_joint_spam = np.sum(np.log(p_words_spam[present])) + np.log(p_spam)
    log_joint_ham = np.sum(np.log(p_words_ham[present])) + np.log(p_ham)

    # Subtract the larger log-joint before exponentiating so that at least
    # one of the two terms is exactly 1 and the ratio stays well defined.
    shift = max(log_joint_spam, log_joint_ham)
    spam_term = np.exp(log_joint_spam - shift)
    ham_term = np.exp(log_joint_ham - shift)
    return spam_term / (spam_term + ham_term)
    
In [13]:
    
# Score every held-out message: predict() is applied to each row of X_test.
y_pred = np.apply_along_axis(predict, axis=1, arr=X_test)
    
In [14]:
    
# Precision and recall of the spam scores at every candidate threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
ax = plt.figure().add_subplot(111)
ax.set_xlabel('threshold')
# precision/recall have one more entry than thresholds; drop the last point
# so both curves line up with the threshold axis.
ax.plot(thresholds, precision[:-1], label='precision')
ax.plot(thresholds, recall[:-1], label='recall')
ax.legend(bbox_to_anchor=(1.0, 0.8))
plt.show()
    
    
In [15]:
    
# ROC curve of the spam scores on the held-out set.
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
ax = plt.figure().add_subplot(111)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.plot(fpr, tpr)
plt.show()
    
    
In [16]:
    
def is_spam(msg, threshold=0.2):
    """Classify a vectorized message as spam (1) or ham (0).

    Parameters
    ----------
    msg : array-like
        Word counts for one message, in the form accepted by `predict`.
    threshold : float, optional
        Decision cut-off: the message is labelled spam when the value
        returned by `predict` is strictly greater than this. Defaults to 0.2.

    Returns
    -------
    int
        1 if the message is classified as spam, otherwise 0.
    """
    return int(predict(msg) > threshold)
    
In [17]:
    
# Turn the scores into hard 0/1 labels: is_spam() is applied to each row of
# X_test. Note that this rebinds y_pred from probabilities to class labels.
y_pred = np.apply_along_axis(is_spam, axis=1, arr=X_test)
    
In [18]:
    
# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round silently swaps precision and recall.
print("Spam precision: {0:.1f}%".format(precision_score(y_test, y_pred) * 100))
print("Spam recall: {0:.1f}%".format(recall_score(y_test, y_pred) * 100))
    
    
In [19]:
    
from sklearn.metrics import matthews_corrcoef
# Matthews correlation between the true and predicted labels. The coefficient
# is symmetric in its two arguments; they are given as (y_true, y_pred).
matthews_corrcoef(y_test, y_pred)
    
    Out[19]:
In [20]:
    
def is_spam_text(text):
    """Score a raw message string with the fitted vectorizer and `predict`.

    Despite the name, this returns whatever `predict` returns (a spam
    probability), not a boolean.
    """
    counts = cv.transform([text]).todense()
    return predict(np.ravel(counts))